##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:corrgram':
##
## baseball
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.3 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x plyr::arrange() masks dplyr::arrange()
## x purrr::compact() masks plyr::compact()
## x plyr::count() masks dplyr::count()
## x tidyr::extract() masks magrittr::extract()
## x plyr::failwith() masks dplyr::failwith()
## x dplyr::filter() masks stats::filter()
## x plyr::id() masks dplyr::id()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x plyr::mutate() masks dplyr::mutate()
## x plyr::rename() masks dplyr::rename()
## x purrr::set_names() masks magrittr::set_names()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## Loading required package: vcd
## Loading required package: grid
## Loading required package: gnm
##
## Attaching package: 'gnm'
## The following object is masked from 'package:lattice':
##
## barley
##
## Attaching package: 'vcdExtra'
## The following object is masked from 'package:plotly':
##
## summarise
## The following object is masked from 'package:plyr':
##
## summarise
## The following object is masked from 'package:dplyr':
##
## summarise
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: carData
##
## Attaching package: 'carData'
## The following object is masked from 'package:vcdExtra':
##
## Burt
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:psych':
##
## logit
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1
##
## Attaching package: 'plotROC'
## The following object is masked from 'package:pROC':
##
## ggroc
## ResourceSelection 0.3-5 2019-07-22
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'mice'
## The following object is masked from 'package:aplore3':
##
## nhanes
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Loading required package: colorspace
##
## Attaching package: 'colorspace'
## The following object is masked from 'package:pROC':
##
## coords
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
##
## Attaching package: 'dataMaid'
## The following object is masked from 'package:rmarkdown':
##
## render
## The following object is masked from 'package:plyr':
##
## summarize
## The following object is masked from 'package:dplyr':
##
## summarize
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Utility predicate: flag entries that should be treated as "empty".
# An entry counts as empty when it is NA, zero, blank (empty string or a
# single space), or one of the spelled-out NA/NaN strings.
isEmpty <- function(column) {
  is.na(column) |
    column %in% c(0, "", " ", "NA", "na", "Na", "nA", "NaN")
}
Import the dataset. Note: we make sure that the levels of Attrition are in the right order.
# Load the raw case-study data. stringsAsFactors = TRUE turns every
# character column into a factor, which the modeling code below expects.
CaseStudy2.data <- read.csv(
  "../data/CaseStudy2-data.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
# First look at every column's distribution.
summary(CaseStudy2.data)
## ID Age Attrition BusinessTravel
## Min. : 1.0 Min. :18.00 No :730 Non-Travel : 94
## 1st Qu.:218.2 1st Qu.:30.00 Yes:140 Travel_Frequently:158
## Median :435.5 Median :35.00 Travel_Rarely :618
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
##
## DailyRate Department DistanceFromHome Education
## Min. : 103.0 Human Resources : 35 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Research & Development:562 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Sales :273 Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## Human Resources : 15 Min. :1 Min. : 1.0 Min. :1.000
## Life Sciences :358 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
## Marketing :100 Median :1 Median :1039.0 Median :3.000
## Medical :270 Mean :1 Mean :1029.8 Mean :2.701
## Other : 52 3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
## Technical Degree: 75 Max. :1 Max. :2064.0 Max. :4.000
##
## Gender HourlyRate JobInvolvement JobLevel
## Female:354 Min. : 30.00 Min. :1.000 Min. :1.000
## Male :516 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
##
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## Sales Executive :200 Min. :1.000 Divorced:191 Min. : 1081
## Research Scientist :172 1st Qu.:2.000 Married :410 1st Qu.: 2840
## Laboratory Technician :153 Median :3.000 Single :269 Median : 4946
## Manufacturing Director : 87 Mean :2.709 Mean : 6390
## Healthcare Representative: 76 3rd Qu.:4.000 3rd Qu.: 8182
## Sales Representative : 53 Max. :4.000 Max. :19999
## (Other) :129
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. : 2094 Min. :0.000 Y:870 No :618 Min. :11.0
## 1st Qu.: 8092 1st Qu.:1.000 Yes:252 1st Qu.:12.0
## Median :14074 Median :2.000 Median :14.0
## Mean :14326 Mean :2.728 Mean :15.2
## 3rd Qu.:20456 3rd Qu.:4.000 3rd Qu.:18.0
## Max. :26997 Max. :9.000 Max. :25.0
##
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.152 Mean :2.707 Mean :80 Mean :0.7839
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
##
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.05 Mean :2.832 Mean :2.782 Mean : 6.962
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:10.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
##
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.00
## Median : 3.000 Median : 1.000 Median : 3.00
## Mean : 4.205 Mean : 2.169 Mean : 4.14
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.00
## Max. :18.000 Max. :15.000 Max. :17.00
##
# Force "Yes" to be the first level of Attrition so that downstream models
# and confusion matrices treat "Yes" as the positive class.
CaseStudy2.data$Attrition <- ordered(
  CaseStudy2.data$Attrition,
  levels = c("Yes", "No")
)
# Confirm the new level ordering ("Yes" < "No") in the structure dump.
str(CaseStudy2.data)
## 'data.frame': 870 obs. of 36 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : Ord.factor w/ 2 levels "Yes"<"No": 2 2 2 2 2 2 2 2 2 2 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
# Re-check the summary: Attrition now lists "Yes" before "No".
summary(CaseStudy2.data)
## ID Age Attrition BusinessTravel
## Min. : 1.0 Min. :18.00 Yes:140 Non-Travel : 94
## 1st Qu.:218.2 1st Qu.:30.00 No :730 Travel_Frequently:158
## Median :435.5 Median :35.00 Travel_Rarely :618
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
##
## DailyRate Department DistanceFromHome Education
## Min. : 103.0 Human Resources : 35 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Research & Development:562 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Sales :273 Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## Human Resources : 15 Min. :1 Min. : 1.0 Min. :1.000
## Life Sciences :358 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
## Marketing :100 Median :1 Median :1039.0 Median :3.000
## Medical :270 Mean :1 Mean :1029.8 Mean :2.701
## Other : 52 3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
## Technical Degree: 75 Max. :1 Max. :2064.0 Max. :4.000
##
## Gender HourlyRate JobInvolvement JobLevel
## Female:354 Min. : 30.00 Min. :1.000 Min. :1.000
## Male :516 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
##
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## Sales Executive :200 Min. :1.000 Divorced:191 Min. : 1081
## Research Scientist :172 1st Qu.:2.000 Married :410 1st Qu.: 2840
## Laboratory Technician :153 Median :3.000 Single :269 Median : 4946
## Manufacturing Director : 87 Mean :2.709 Mean : 6390
## Healthcare Representative: 76 3rd Qu.:4.000 3rd Qu.: 8182
## Sales Representative : 53 Max. :4.000 Max. :19999
## (Other) :129
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. : 2094 Min. :0.000 Y:870 No :618 Min. :11.0
## 1st Qu.: 8092 1st Qu.:1.000 Yes:252 1st Qu.:12.0
## Median :14074 Median :2.000 Median :14.0
## Mean :14326 Mean :2.728 Mean :15.2
## 3rd Qu.:20456 3rd Qu.:4.000 3rd Qu.:18.0
## Max. :26997 Max. :9.000 Max. :25.0
##
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.152 Mean :2.707 Mean :80 Mean :0.7839
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
##
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.05 Mean :2.832 Mean :2.782 Mean : 6.962
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:10.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
##
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.00
## Median : 3.000 Median : 1.000 Median : 3.00
## Mean : 4.205 Mean : 2.169 Mean : 4.14
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.00
## Max. :18.000 Max. :15.000 Max. :17.00
##
# Convert the integer-coded ordinal/categorical columns to factors.
# NOTE: the original code listed PerformanceRating, RelationshipSatisfaction,
# StockOptionLevel, TrainingTimesLastYear and EnvironmentSatisfaction twice;
# the list below applies each conversion exactly once (as.factor is
# idempotent, so the result is identical).
ordinalCols <- c(
  "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel",
  "TrainingTimesLastYear", "EnvironmentSatisfaction", "WorkLifeBalance",
  "Education", "JobInvolvement", "JobSatisfaction", "JobLevel"
)
CaseStudy2.data[ordinalCols] <- lapply(CaseStudy2.data[ordinalCols], as.factor)
# Plot missing data (there should be none)
# VIM::aggr draws the proportion of missings per variable (left panel) and
# the missing-data pattern combinations (right panel); sortVars=TRUE also
# prints the per-variable missing counts shown below.
bdat_mice_clean <- aggr(CaseStudy2.data, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(CaseStudy2.data), cex.axis=.7,
gap=3, ylab=c("Missing Data (distribution)","Missing Data (Pattern)"))
##
## Variables sorted by number of missings:
## Variable Count
## ID 0
## Age 0
## Attrition 0
## BusinessTravel 0
## DailyRate 0
## Department 0
## DistanceFromHome 0
## Education 0
## EducationField 0
## EmployeeCount 0
## EmployeeNumber 0
## EnvironmentSatisfaction 0
## Gender 0
## HourlyRate 0
## JobInvolvement 0
## JobLevel 0
## JobRole 0
## JobSatisfaction 0
## MaritalStatus 0
## MonthlyIncome 0
## MonthlyRate 0
## NumCompaniesWorked 0
## Over18 0
## OverTime 0
## PercentSalaryHike 0
## PerformanceRating 0
## RelationshipSatisfaction 0
## StandardHours 0
## StockOptionLevel 0
## TotalWorkingYears 0
## TrainingTimesLastYear 0
## WorkLifeBalance 0
## YearsAtCompany 0
## YearsInCurrentRole 0
## YearsSinceLastPromotion 0
## YearsWithCurrManager 0
Remove the ID data. Additionally, remove any field which has ONLY 1 value for this exercise.
# Drop the row identifier plus the three constant (single-valued) columns:
# EmployeeCount (always 1), Over18 (always "Y"), StandardHours (always 80).
CaseStudy2.data.NoIdOrUselessData <-
  dplyr::select(CaseStudy2.data, -ID, -EmployeeCount, -Over18, -StandardHours)
Split the data randomly: 80% training data, 20% test data.
# Hold out a test set. createDataPartition samples within the levels of
# Attrition, so the Yes/No class proportions are preserved in each split.
set.seed(223)
trainIndex <- createDataPartition(
  CaseStudy2.data.NoIdOrUselessData$Attrition,
  p = .80,
  list = FALSE,
  times = 1
)
CaseStudy2.dtrain <- CaseStudy2.data.NoIdOrUselessData[trainIndex, ]
CaseStudy2.dtest  <- CaseStudy2.data.NoIdOrUselessData[-trainIndex, ]
Note: We assume that this data does not need to be corrected for serial correlation/autocorrelation. In truth, there may be some serial correlation with employees that were hired later or hired earlier, or just in what year there was more attrition. There are many other possible confounding factors, such as the economy, that we will ignore for the sake of this analysis.
Utility Functions for Confusion Matrix With Custom threshold.
# Build a caret confusion matrix using an arbitrary probability threshold.
#
# model         : fitted classifier supporting predict(type = "prob")
# data          : data frame containing an Attrition column ("Yes"/"No")
# threshold     : cutoff on P(Yes); rows at or above it are labeled "Yes"
# probabilities : optional precomputed probability data frame (with a $Yes
#                 column); computed from `model` on `data` when NULL
# Returns the caret::confusionMatrix object for the thresholded predictions.
confusionMatrixForCustomThreshold <- function(model, data, threshold, probabilities=NULL) {
  if (is.null(probabilities)) {
    probabilities <- predict(model, newdata = data, type = "prob")
  }
  predicted <- ifelse(probabilities$Yes >= threshold, "Yes", "No")
  predicted <- factor(predicted, levels = c("Yes", "No"))
  confusionMatrix(predicted, data$Attrition)
}
# Sweep the decision threshold from 0 to 1 (step 0.001) and plot accuracy,
# sensitivity and specificity at each threshold.
#
# model     : fitted classifier supporting predict(type = "prob")
# data      : data frame containing an Attrition column ("Yes"/"No")
# testTitle : main title for the resulting plot
plotConfusionMatrixByThreshold <- function(model, data, testTitle) {
  thresholdSequence <- seq(0, 1, by = 0.001)
  # Preallocate result vectors instead of growing them inside the loop.
  n <- length(thresholdSequence)
  accuracy <- numeric(n)
  sensitivities <- numeric(n)
  specificities <- numeric(n)
  # Predict class probabilities once and reuse them for every threshold.
  probabilities <- predict(model, newdata = data, type = "prob")
  for (i in seq_along(thresholdSequence)) {
    # Renamed from `confusionMatrix` to avoid shadowing caret::confusionMatrix.
    cm <- confusionMatrixForCustomThreshold(model, data, thresholdSequence[i], probabilities)
    accuracy[i] <- unname(cm$overall['Accuracy'])
    sensitivities[i] <- unname(cm$byClass['Sensitivity'])
    specificities[i] <- unname(cm$byClass['Specificity'])
  }
  plot(x = thresholdSequence, y = accuracy, lty = 2, lwd = 2, col = "red",
       xlab = "threshold", ylab = "probability", main = testTitle)
  points(x = thresholdSequence, y = sensitivities, col = "green")
  points(x = thresholdSequence, y = specificities, col = "blue")
  legend("bottomright", legend = c("Accuracy", "Sensitivity", "Specificity"),
         col = c("red", "green", "blue"), lty = 1:2, cex = 0.8)
}
Note: We have an imbalanced dataset. There are FAR more “No” entries than “Yes” entries for attrition. Further, we can see that there is multicollinearity between some of the variables. More on this below.
# Class-balance check: far more "No" than "Yes" attrition records.
ggplot(CaseStudy2.data.NoIdOrUselessData, aes(x = Attrition, fill = Attrition)) +
  geom_bar() +
  ggtitle("Attrition Count")
# Pairwise relationships among columns 1-6, colored by Attrition.
ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=1:6,aes(colour=Attrition), legend =1, progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Pairwise relationships among columns 7-11, colored by Attrition.
ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=7:11,aes(colour=Attrition), legend =1, progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Pairwise relationships among columns 12-17, colored by Attrition.
ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=12:17,aes(colour=Attrition), legend =1, progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Pairwise relationships among columns 18-22, colored by Attrition.
ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=18:22,aes(colour=Attrition), legend =1, progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Pairwise relationships among columns 23-32, colored by Attrition.
ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=23:32,aes(colour=Attrition), legend =1, progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
To see whether or not there is any separation to begin with among the continuous variables in our dataset, we perform PCA.
# PCA is only meaningful for the continuous predictors, so keep just the
# numerical columns. (The constant columns EmployeeCount — always 1 — and
# StandardHours — always 80 — were already dropped above.)
reduced.numerical <- dplyr::select(
  CaseStudy2.data.NoIdOrUselessData,
  Age, DailyRate, DistanceFromHome, EmployeeNumber, HourlyRate,
  MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike,
  TotalWorkingYears, YearsAtCompany, YearsInCurrentRole,
  YearsSinceLastPromotion, YearsWithCurrManager
)
# Center and scale the variables before extracting principal components.
pc.result <- prcomp(reduced.numerical, center = TRUE, scale. = TRUE)
summary(pc.result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0059 1.2956 1.05657 1.0213 1.0165 0.98226 0.97339
## Proportion of Variance 0.2874 0.1199 0.07974 0.0745 0.0738 0.06892 0.06768
## Cumulative Proportion 0.2874 0.4073 0.48706 0.5616 0.6354 0.70428 0.77195
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.94497 0.82918 0.72764 0.69627 0.5331 0.43047 0.35827
## Proportion of Variance 0.06378 0.04911 0.03782 0.03463 0.0203 0.01324 0.00917
## Cumulative Proportion 0.83574 0.88485 0.92267 0.95730 0.9776 0.99083 1.00000
# Print the full rotation (loadings) matrix and standard deviations.
pc.result
## Standard deviations (1, .., p=14):
## [1] 2.0059450 1.2956270 1.0565657 1.0212669 1.0164890 0.9822607 0.9733940
## [8] 0.9449691 0.8291810 0.7276433 0.6962723 0.5331140 0.4304664 0.3582698
##
## Rotation (n x k) = (14 x 14):
## PC1 PC2 PC3 PC4
## Age 0.268657604 -0.46969559 0.03801522 0.004793867
## DailyRate -0.015632448 -0.08607082 0.47290417 -0.244349796
## DistanceFromHome -0.009042931 0.01006789 0.53123059 0.431806807
## EmployeeNumber 0.020613595 -0.01516990 -0.28143762 0.602566807
## HourlyRate 0.009607752 -0.06470642 0.46587922 0.461291620
## MonthlyIncome 0.351601205 -0.30812876 -0.01319624 -0.002860960
## MonthlyRate 0.021193417 -0.11844544 -0.27625186 0.347283050
## NumCompaniesWorked 0.022117913 -0.56513974 -0.03939699 -0.130748171
## PercentSalaryHike -0.044458408 0.02564669 0.33753827 -0.192364833
## TotalWorkingYears 0.419495870 -0.31386835 0.01535436 -0.012617636
## YearsAtCompany 0.444574253 0.21971029 0.01746146 -0.021384145
## YearsInCurrentRole 0.396199671 0.26247011 0.04378918 -0.028553516
## YearsSinceLastPromotion 0.350788132 0.19964615 -0.03554482 0.025563539
## YearsWithCurrManager 0.383134246 0.28794274 0.02765768 -0.033735173
## PC5 PC6 PC7 PC8
## Age -0.04592299 -0.07558154 -0.01097235 -0.024213353
## DailyRate 0.21979058 0.73218168 0.06161879 0.330659252
## DistanceFromHome -0.29716545 -0.33498780 -0.09203440 0.546351080
## EmployeeNumber 0.09095769 0.40608518 -0.61853786 -0.001755972
## HourlyRate 0.38510061 -0.02808084 0.27629416 -0.579177883
## MonthlyIncome -0.06040673 -0.03702276 -0.03223729 0.050688038
## MonthlyRate -0.48506408 0.36752611 0.63293946 0.031555809
## NumCompaniesWorked 0.09485781 -0.01127824 -0.11796538 -0.045494002
## PercentSalaryHike -0.67297948 0.16823587 -0.33247393 -0.499254233
## TotalWorkingYears -0.03053411 -0.05581647 -0.02981403 0.014846964
## YearsAtCompany 0.01029127 0.01568290 -0.01020871 -0.009857063
## YearsInCurrentRole -0.02222431 0.09280639 0.02815357 -0.014082742
## YearsSinceLastPromotion 0.04139600 -0.02169833 0.02586783 -0.021490465
## YearsWithCurrManager 0.02487533 0.05445161 -0.01111947 -0.016163807
## PC9 PC10 PC11 PC12
## Age -0.199671381 -0.310612359 0.704909667 0.031703624
## DailyRate -0.056423087 -0.082806448 -0.017912911 -0.029974271
## DistanceFromHome 0.150494611 0.016873575 -0.014730989 -0.008062339
## EmployeeNumber -0.016537886 -0.006804817 0.027588072 0.011499996
## HourlyRate -0.005154642 0.046212685 -0.052634484 -0.001143078
## MonthlyIncome -0.427504712 0.217762765 -0.541763959 -0.007841276
## MonthlyRate 0.111705888 0.023929120 -0.010011862 -0.045563589
## NumCompaniesWorked 0.757355597 0.146377660 -0.138581967 -0.009590849
## PercentSalaryHike 0.010275009 -0.048174904 -0.046362033 -0.028479294
## TotalWorkingYears -0.147735635 0.048416249 -0.104994783 -0.039311910
## YearsAtCompany 0.008429314 0.077395487 0.005471988 -0.001417094
## YearsInCurrentRole 0.212843634 0.221997633 0.133424346 0.746451728
## YearsSinceLastPromotion 0.247975978 -0.798238146 -0.302495624 -0.100200306
## YearsWithCurrManager 0.190725312 0.357287941 0.252409719 -0.652747705
## PC13 PC14
## Age 0.1633557935 0.1904632816
## DailyRate -0.0145624378 0.0002688141
## DistanceFromHome 0.0003815934 0.0210793918
## EmployeeNumber -0.0044771135 -0.0068717557
## HourlyRate 0.0039682314 0.0078256926
## MonthlyIncome 0.4104839924 0.2835049859
## MonthlyRate -0.0309552690 0.0210919371
## NumCompaniesWorked -0.0103578652 0.1481613000
## PercentSalaryHike -0.0066217009 -0.0004830507
## TotalWorkingYears -0.4468076482 -0.6954001992
## YearsAtCompany -0.6389702137 0.5817127252
## YearsInCurrentRole 0.2643059279 -0.1478828957
## YearsSinceLastPromotion 0.1711834086 -0.0483017202
## YearsWithCurrManager 0.3104497107 -0.1192701687
# Score matrix for each observation, with Attrition attached for coloring.
pc.scores <- data.frame(pc.result$x)
pc.scores$Attrition <- as.factor(CaseStudy2.data$Attrition)
# 'title' is not a scatter3d trace attribute (it only produced a warning and
# no title); in plotly the figure title belongs in layout().
plot_ly(
  x = pc.scores$PC1, y = pc.scores$PC2, z = pc.scores$PC3,
  type = "scatter3d", mode = "markers", color = pc.scores$Attrition
) %>% layout(title = "Principal Components 1-3")
## Warning: 'scatter3d' objects don't have these attributes: 'title'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'y', 'z', 'text', 'texttemplate', 'hovertext', 'hovertemplate', 'mode', 'surfaceaxis', 'surfacecolor', 'projection', 'connectgaps', 'line', 'marker', 'textposition', 'textfont', 'hoverinfo', 'error_x', 'error_y', 'error_z', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'metasrc', 'xsrc', 'ysrc', 'zsrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'hoverinfosrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## Warning: 'scatter3d' objects don't have these attributes: 'title'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'y', 'z', 'text', 'texttemplate', 'hovertext', 'hovertemplate', 'mode', 'surfaceaxis', 'surfacecolor', 'projection', 'connectgaps', 'line', 'marker', 'textposition', 'textfont', 'hoverinfo', 'error_x', 'error_y', 'error_z', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'metasrc', 'xsrc', 'ysrc', 'zsrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'hoverinfosrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
# Pairs plot of the first five principal-component scores, colored by
# Attrition. The stray ", ," in the original call passed an empty (missing)
# argument to ggpairs and has been removed.
ggpairs(pc.scores, columns = 1:5, aes(colour = Attrition), legend = 1, progress = FALSE)
## Cluster and HeatMap. This HeatMap is used to show similarities between data with a non-parametric hierarchical cluster. Our distance metric is still Euclidean, so the clustering of categorical variables may not be totally accurate.
# Heatmap with hierarchical clustering; data.matrix coerces factor columns
# to their integer codes, so distances for categoricals are only approximate.
pheatmap::pheatmap(data.matrix(CaseStudy2.data.NoIdOrUselessData), scale = "column")
## Multicollinearity. We can see that there are high GVIF values for certain predictors in our model. GVIF is used so that we may include categorical variables as well. There is conflicting information on what to do with GVIF (rules of thumb). We will just keep the high VIF values in mind when building plots to show multicollinearity below.
# Work on a copy with a dummy response (a duplicate of Age) so that vif()
# can be computed from a regression that uses every remaining predictor.
CaseStudy2.data.NoIdOrUselessData.Copy <-
  transform(CaseStudy2.data.NoIdOrUselessData, AgeDummy = Age)
# GVIF table — car::vif reports generalized VIFs so that multi-level
# categorical predictors are handled as well.
data.table(vif(lm(AgeDummy ~ ., data = CaseStudy2.data.NoIdOrUselessData.Copy)))
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable
## GVIF Df GVIF^(1/(2*Df))
## 1: 2.060730 1 1.435524
## 2: 1.549002 1 1.244589
## 3: 1.185951 2 1.043558
## 4: 1.076613 1 1.037599
## 5: 121.573870 2 3.320550
## 6: 1.113235 1 1.055099
## 7: 1.508926 4 1.052770
## 8: 3.473377 5 1.132596
## 9: 1.096800 1 1.047282
## 10: 1.263808 3 1.039793
## 11: 1.065564 1 1.032261
## 12: 1.075300 1 1.036967
## 13: 1.304695 3 1.045325
## 14: 61.766716 4 1.674343
## 15: 2194.770794 8 1.617472
## 16: 1.286452 3 1.042875
## 17: 3.204596 2 1.337961
## 18: 22.471085 1 4.740368
## 19: 1.088645 1 1.043382
## 20: 1.445871 1 1.202444
## 21: 1.176766 1 1.084789
## 22: 2.709157 1 1.645952
## 23: 2.689863 1 1.640080
## 24: 1.263835 3 1.039797
## 25: 3.534365 3 1.234199
## 26: 5.936916 1 2.436579
## 27: 1.561017 6 1.037809
## 28: 1.243624 3 1.037007
## 29: 5.461741 1 2.337037
## 30: 3.113613 1 1.764543
## 31: 1.959381 1 1.399779
## 32: 2.922524 1 1.709539
## GVIF Df GVIF^(1/(2*Df))
Variable Importance After further analysis, we can see that, when predicting Attrition, Random Forests seem to perform the best for the models we’ve tried. Since Random Forest performs the best for our Attrition Classification models, we will use its ranking of variable importance to guide our comparison of multivariate (categorical and numerical) variable importance.
# Five-fold cross-validation control for the two-class Attrition problem,
# saving fold predictions and class probabilities so ROC can be computed.
ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE,
                     summaryFunction = twoClassSummary, savePredictions = TRUE)
# Random forest over all predictors, tuned on cross-validated ROC.
fit2.HigherComplexity.Rf <- train(Attrition ~ .,
                                  data = CaseStudy2.data.NoIdOrUselessData,
                                  method = "rf",
                                  metric = "ROC",
                                  ntree = 200,
                                  trControl = ctrl)
fit2.HigherComplexity.Rf$metric
## [1] "ROC"
fit2.HigherComplexity.Rf
## Random Forest
##
## 870 samples
## 31 predictor
## 2 classes: 'Yes', 'No'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 696, 696, 696, 696, 696
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.7656800 0.03571429 1.0000000
## 35 0.7785959 0.22142857 0.9780822
## 68 0.7587573 0.25714286 0.9767123
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 35.
# Training-set confusion matrix for the random forest at a hand-picked
# probability threshold of 0.4.
plotConfusionMatrixByThreshold(
  fit2.HigherComplexity.Rf,
  CaseStudy2.dtrain,
  "Train Confusion Matrix Stats - Random Forest"
)
threshold <- 0.4
print("Train Confusion Matrix")
## [1] "Train Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.4"
confusionMatrixForCustomThreshold(
  fit2.HigherComplexity.Rf,
  CaseStudy2.dtrain,
  threshold
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 112 0
## No 0 584
##
## Accuracy : 1
## 95% CI : (0.9947, 1)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1609
## Detection Rate : 0.1609
## Detection Prevalence : 0.1609
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Yes
##
# Held-out test confusion matrix at the same 0.4 threshold.
plotConfusionMatrixByThreshold(
  fit2.HigherComplexity.Rf,
  CaseStudy2.dtest,
  "Test Confusion Matrix Stats - Random Forest"
)
print("Test Confusion Matrix")
## [1] "Test Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.4"
confusionMatrixForCustomThreshold(
  fit2.HigherComplexity.Rf,
  CaseStudy2.dtest,
  threshold
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 28 0
## No 0 146
##
## Accuracy : 1
## 95% CI : (0.979, 1)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 5.519e-14
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1609
## Detection Rate : 0.1609
## Detection Prevalence : 0.1609
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Yes
##
# Inspect the structure of the underlying randomForest final-model object.
summary(fit2.HigherComplexity.Rf$finalModel)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 870 factor numeric
## err.rate 600 -none- numeric
## confusion 6 -none- numeric
## votes 1740 matrix numeric
## oob.times 870 -none- numeric
## classes 2 -none- character
## importance 68 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 870 ordered numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 68 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## param 1 -none- list
# ROC Plot
# evalm summarises the fold predictions saved by caret (savePredictions = T)
# and draws the cross-validated ROC curve; `res` holds the returned metrics.
res <- evalm(fit2.HigherComplexity.Rf,gnames=c('Train'), title='ROC: RandomForest')
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 870
## Number of groups: 1
## Observations per group: 870
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 730
## Negative: 140
## ***Performance Metrics***
## Train Optimal Informedness = 0.421624266144814
## Train AUC-ROC = 0.77
# Top-8 unscaled variable importances from the fitted random forest.
plot(varImp(fit2.HigherComplexity.Rf, scale = FALSE), top = 8, main="Variable Importance: Random Forest")
Here we plot many of the “top” variables against one another to show multicollinearity between some of the different variables. Notice that ‘MonthlyIncome’ is highly collinear with ‘TotalWorkingYears’. It is also highly collinear with ‘YearsAtCompany’. As such, we can pick MonthlyIncome and leave out TotalWorkingYears as well as YearsAtCompany from the “top factors” list.
# Pairs plot of the most important predictors (per random-forest importance),
# coloured by Attrition, to inspect pairwise collinearity.
Top10VariableData <- CaseStudy2.data.NoIdOrUselessData %>%
  dplyr::select(
    Attrition, MonthlyIncome, OverTime, Age, JobRole, DailyRate,
    TotalWorkingYears, EmployeeNumber, MonthlyRate, DistanceFromHome
  )
ggpairs(Top10VariableData, aes(colour = Attrition), columns = 2:9,
        legend = 1, progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatter plots with loess smoothers of MonthlyIncome (and MonthlyRate)
# against other top predictors, coloured by Attrition, to visualise the
# pairwise collinearity discussed above.
CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=Age, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. Age")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=TotalWorkingYears, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. TotalWorkingYears")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=YearsAtCompany, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. YearsAtCompany")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=DailyRate, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. DailyRate")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=MonthlyRate, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. MonthlyRate")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=HourlyRate, y=MonthlyRate, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyRate vs. HourlyRate")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Variable Importance, Multicollinearity corrected. Now that we know which continuous variables we can remove due to multicollinearity, let’s re-fit the random forest and look at variable importance a second time.
# Re-fit the random forest without the collinear predictors.
# FIX: the original formula used `- I(TotalWorkingYears) - I(YearsAtCompany)`.
# In an R formula, `I(x)` is a distinct term that `.` never expands to, so
# subtracting it is a no-op and NOTHING was actually removed (the model still
# showed 68 dummy predictors).  Plain `- var` removes the variables.
ctrl <- trainControl(method="cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, number = 5)
fit2.HigherComplexity.Rf <- train(Attrition ~ . - TotalWorkingYears - YearsAtCompany, data = CaseStudy2.data.NoIdOrUselessData, method = 'rf', trControl = ctrl, ntree = 200, metric = "ROC")
fit2.HigherComplexity.Rf$metric
## [1] "ROC"
# Train Confusion Matrix
# Training-split confusion matrix for the re-fitted forest at threshold 0.4.
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, "Train Confusion Matrix Stats - Random Forest")
threshold <- .4
print("Train Confusion Matrix")
## [1] "Train Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.4"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, threshold)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 112 0
## No 0 584
##
## Accuracy : 1
## 95% CI : (0.9947, 1)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1609
## Detection Rate : 0.1609
## Detection Prevalence : 0.1609
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Yes
##
# Test Confusion Matrix
# Held-out test confusion matrix for the re-fitted forest, same 0.4 threshold.
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, "Test Confusion Matrix Stats - Random Forest")
print("Test Confusion Matrix")
## [1] "Test Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.4"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, threshold)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 28 0
## No 0 146
##
## Accuracy : 1
## 95% CI : (0.979, 1)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 5.519e-14
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1609
## Detection Rate : 0.1609
## Detection Prevalence : 0.1609
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Yes
##
# Structure of the randomForest object from the re-fitted model.
summary(fit2.HigherComplexity.Rf$finalModel)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 870 factor numeric
## err.rate 600 -none- numeric
## confusion 6 -none- numeric
## votes 1740 matrix numeric
## oob.times 870 -none- numeric
## classes 2 -none- character
## importance 68 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 870 ordered numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 68 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## param 1 -none- list
# ROC Plot
# Cross-validated ROC curve for the re-fitted random forest.
res <- evalm(fit2.HigherComplexity.Rf,gnames=c('Train'), title='ROC: RandomForest')
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 870
## Number of groups: 1
## Observations per group: 870
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 730
## Negative: 140
## ***Performance Metrics***
## Train Optimal Informedness = 0.454500978473581
## Train AUC-ROC = 0.78
# Top-5 unscaled variable importances for the re-fitted random forest.
plot(varImp(fit2.HigherComplexity.Rf, scale = FALSE), top = 5, main="Variable Importance: Random Forest")
We now classify with Random Forest with All variables. Since we only care about prediction and not interpretation, we can just use all the multicollinear variables and not worry about it. Note: we tune the model to use a threshold of 0.145 to account for the imbalance in our dataset.
# Ten-fold CV random forest fitted on the training split only.
# FIX: `metric = "ROC"` is now passed explicitly.  trainControl uses
# twoClassSummary (which produces ROC/Sens/Spec, not Accuracy), so caret
# previously emitted 'The metric "Accuracy" was not in the result set. ROC
# will be used instead.'  Selection behaviour is unchanged; the warning goes
# away.
ctrl <- trainControl(method="cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, number = 10)
fit2.HigherComplexity.Rf <- train(Attrition ~ ., data = CaseStudy2.dtrain, method = 'rf', trControl = ctrl, metric = "ROC")
# Train Confusion Matrix
# A much lower threshold (0.145) trades precision for recall on the minority
# "Yes" class to offset the class imbalance in the data.
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, "Train Confusion Matrix Stats - Random Forest")
threshold <- .145
print("Train Confusion Matrix")
## [1] "Train Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.145"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, threshold)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 112 24
## No 0 560
##
## Accuracy : 0.9655
## 95% CI : (0.9491, 0.9778)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8825
##
## Mcnemar's Test P-Value : 2.668e-06
##
## Sensitivity : 1.0000
## Specificity : 0.9589
## Pos Pred Value : 0.8235
## Neg Pred Value : 1.0000
## Prevalence : 0.1609
## Detection Rate : 0.1609
## Detection Prevalence : 0.1954
## Balanced Accuracy : 0.9795
##
## 'Positive' Class : Yes
##
# Test Confusion Matrix
# Same 0.145 threshold applied to the held-out test split.
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, "Test Confusion Matrix Stats - Random Forest")
print("Test Confusion Matrix")
## [1] "Test Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.145"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, threshold)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 20 56
## No 8 90
##
## Accuracy : 0.6322
## 95% CI : (0.5559, 0.7039)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1954
##
## Mcnemar's Test P-Value : 4.228e-09
##
## Sensitivity : 0.7143
## Specificity : 0.6164
## Pos Pred Value : 0.2632
## Neg Pred Value : 0.9184
## Prevalence : 0.1609
## Detection Rate : 0.1149
## Detection Prevalence : 0.4368
## Balanced Accuracy : 0.6654
##
## 'Positive' Class : Yes
##
# Structure of the randomForest object fitted on the training split.
summary(fit2.HigherComplexity.Rf$finalModel)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 696 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1392 matrix numeric
## oob.times 696 -none- numeric
## classes 2 -none- character
## importance 68 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 696 ordered numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 68 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## param 0 -none- list
# ROC Plot
# Cross-validated (train) ROC curve for the 10-fold random forest.
res <- evalm(fit2.HigherComplexity.Rf,gnames=c('Train'), title='ROC: RandomForest')
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 696
## Number of groups: 1
## Observations per group: 696
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 584
## Negative: 112
## ***Performance Metrics***
## Train Optimal Informedness = 0.452788649706458
## Train AUC-ROC = 0.78
# Held-out test ROC for the tuned random forest.
pred2 <- predict(fit2.HigherComplexity.Rf, newdata = CaseStudy2.dtest, type="prob")
# FIX: the plot title previously said "Logistic Regression-LASSO", but the
# model being evaluated here is the random forest.
test1 <- evalm(data.frame(pred2, CaseStudy2.dtest$Attrition), title='Test ROC: Random Forest')
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 174
## Number of groups: 1
## Observations per group: 174
## Positive: No
## Negative: Yes
## Group: Group1
## Positive: 146
## Negative: 28
## ***Performance Metrics***
## Group1 Optimal Informedness = 0.37426614481409
## Group1 AUC-ROC = 0.7
Note it appears that Logistic Regression with LASSO L1-regularization is absolutely outperformed by Random Forests.
## Logistic Regression: Fit Model -
# Elastic-net logistic regression with alpha = 1 (pure LASSO) over all
# pairwise interactions plus squared numeric terms; lambda is tuned on a
# fine grid via 5-fold CV, optimising ROC.
ctrl <- trainControl(method="cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, number = 5, search="grid")
fit2.HigherComplexity.Lasso <- train(Attrition ~ (.)^2 + I(Age^2) + I(DailyRate^2)+ I(DistanceFromHome^2)+ I(EmployeeNumber^2)+ I(HourlyRate^2)+ I(MonthlyIncome^2)+ I(MonthlyRate^2)+ I(NumCompaniesWorked^2)+ I(PercentSalaryHike^2) + I(TotalWorkingYears^2) + I(YearsAtCompany^2), data = CaseStudy2.dtrain, method = 'glmnet', trControl = ctrl, metric = "ROC", tuneGrid = data.frame(alpha = 1, lambda = 10^seq(-2, -1.75, by = 0.0001)))
# Structure of the fitted glmnet object inside the caret wrapper.
summary(fit2.HigherComplexity.Lasso)
## Length Class Mode
## a0 100 -none- numeric
## beta 227100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 2271 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## param 0 -none- list
# Coefficients at the CV-selected lambda.  The commented line below would
# list the names of coefficients that are effectively non-zero.
coefficients <- coef(fit2.HigherComplexity.Lasso$finalModel, fit2.HigherComplexity.Lasso$bestTune$lambda)
# coefficients@Dimnames[[1]][which(coefficients > .0001 |coefficients < -.0001 )]
# Training-set confusion matrix for the LASSO classifier at the default
# 0.5 probability threshold.
plotConfusionMatrixByThreshold(
  fit2.HigherComplexity.Lasso,
  CaseStudy2.dtrain,
  "Train Confusion Matrix Stats - Lasso"
)
threshold <- 0.5
print("Train Confusion Matrix")
## [1] "Train Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.5"
confusionMatrixForCustomThreshold(
  fit2.HigherComplexity.Lasso,
  CaseStudy2.dtrain,
  threshold
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 77 0
## No 35 584
##
## Accuracy : 0.9497
## 95% CI : (0.9308, 0.9647)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7869
##
## Mcnemar's Test P-Value : 9.081e-09
##
## Sensitivity : 0.6875
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9435
## Prevalence : 0.1609
## Detection Rate : 0.1106
## Detection Prevalence : 0.1106
## Balanced Accuracy : 0.8438
##
## 'Positive' Class : Yes
##
# Held-out test confusion matrix for the LASSO classifier, same 0.5 threshold.
plotConfusionMatrixByThreshold(
  fit2.HigherComplexity.Lasso,
  CaseStudy2.dtest,
  "Test Confusion Matrix Stats - Lasso"
)
print("Test Confusion Matrix")
## [1] "Test Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.5"
confusionMatrixForCustomThreshold(
  fit2.HigherComplexity.Lasso,
  CaseStudy2.dtest,
  threshold
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 8 4
## No 20 142
##
## Accuracy : 0.8621
## 95% CI : (0.8018, 0.9096)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.2386
##
## Kappa : 0.3359
##
## Mcnemar's Test P-Value : 0.0022
##
## Sensitivity : 0.28571
## Specificity : 0.97260
## Pos Pred Value : 0.66667
## Neg Pred Value : 0.87654
## Prevalence : 0.16092
## Detection Rate : 0.04598
## Detection Prevalence : 0.06897
## Balanced Accuracy : 0.62916
##
## 'Positive' Class : Yes
##
# ROC Plot
# Cross-validated ROC curve for the LASSO logistic model.
res <- evalm(fit2.HigherComplexity.Lasso,gnames=c('Train'), title='ROC: Logistic Regression-LASSO')
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 696
## Number of groups: 1
## Observations per group: 696
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 584
## Negative: 112
## ***Performance Metrics***
## Train Optimal Informedness = 0.500978473581213
## Train AUC-ROC = 0.81
#Train
# Same cross-validated ROC, re-plotted with an explicit "Train" title.
res <- evalm(fit2.HigherComplexity.Lasso,gnames=c('Train'), title='Train ROC: Logistic Regression-LASSO')
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 696
## Number of groups: 1
## Observations per group: 696
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 584
## Negative: 112
## ***Performance Metrics***
## Train Optimal Informedness = 0.500978473581213
## Train AUC-ROC = 0.81
#Test
# Held-out test ROC for the LASSO classifier (predicted probabilities
# against the true Attrition labels).
pred2 <- predict(fit2.HigherComplexity.Lasso, newdata = CaseStudy2.dtest, type="prob")
test1 <- evalm(data.frame(pred2, CaseStudy2.dtest$Attrition), title='Test ROC: Logistic Regression-LASSO')
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 174
## Number of groups: 1
## Observations per group: 174
## Positive: No
## Negative: Yes
## Group: Group1
## Positive: 146
## Negative: 28
## ***Performance Metrics***
## Group1 Optimal Informedness = 0.487279843444227
## Group1 AUC-ROC = 0.78
# Re-inspect the fitted glmnet object after evaluation.
summary(fit2.HigherComplexity.Lasso)
## Length Class Mode
## a0 100 -none- numeric
## beta 227100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 2271 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## param 0 -none- list
# LASSO regression predicting MonthlyIncome from interactions + squared terms.
# FIX (data leakage): the original formula included I(MonthlyIncome^2) -- the
# square of the RESPONSE -- as a predictor.  `(.)^2` correctly excludes the
# response, but the explicit squared term leaked the target into the model
# and made the reported RMSE (~484) unrealistically low.  It is removed here;
# all other squared terms are kept.
ctrl <- trainControl(method="cv", number = 5, search="grid")
fit2.HigherComplexity.Lasso.Regression <- train(MonthlyIncome ~ (.)^2 + I(Age^2) + I(DailyRate^2)+ I(DistanceFromHome^2)+ I(EmployeeNumber^2)+ I(HourlyRate^2)+ I(MonthlyRate^2)+ I(NumCompaniesWorked^2)+ I(PercentSalaryHike^2) + I(TotalWorkingYears^2) + I(YearsAtCompany^2), data = CaseStudy2.dtrain, method = 'glmnet', trControl = ctrl, tuneGrid = data.frame(alpha = 1, lambda = 10^seq(-2, -1.75, by = 0.0001)))
# Structure of the fitted glmnet regression object.
summary(fit2.HigherComplexity.Lasso.Regression)
## Length Class Mode
## a0 100 -none- numeric
## beta 227100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 2271 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 1 -none- logical
## param 0 -none- list
# Coefficients at the CV-selected lambda, then training-set RMSE.
coefficients <- coef(fit2.HigherComplexity.Lasso.Regression$finalModel, fit2.HigherComplexity.Lasso.Regression$bestTune$lambda)
train_prediction <- predict(fit2.HigherComplexity.Lasso.Regression, CaseStudy2.dtrain)
# Mean squared error on the training split; the square root below is RMSE.
train_ASE <- mean((CaseStudy2.dtrain$MonthlyIncome-train_prediction)^2)
print("Train RMSE")  # FIX: label typo was "RSME"
## [1] "Train RMSE"
(train_ASE)^0.5
## [1] 483.6119
# Test-set RMSE for the LASSO regression model.
test_prediction <- predict(fit2.HigherComplexity.Lasso.Regression, CaseStudy2.dtest)
test_ASE <- mean((CaseStudy2.dtest$MonthlyIncome-test_prediction)^2)
print("Test RMSE")  # FIX: label typo was "RSME"
## [1] "Test RMSE"
(test_ASE)^0.5
## [1] 495.8135
print("Use this method! RMSE is acceptable!")
## [1] "Use this method! RMSE is acceptable!"
Note: caret selects (via cross-validation) k=5 as the optimal k-value (using RMSE as the metric). Unfortunately, BOTH the Train and Test RMSE are BAD! This is likely because we’ve only used numerical variables in our model (since they are the only ones that work for KNN).
# KNN regression of MonthlyIncome on the numeric predictors only (distances
# are only meaningful for numeric columns).  Predictors are centred and
# scaled; k is tuned over 20 values with repeated 10-fold CV.
numeric_cols <- c(
  "Age", "DailyRate", "DistanceFromHome", "EmployeeNumber", "HourlyRate",
  "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
  "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
  "YearsSinceLastPromotion", "YearsWithCurrManager"
)
reduced.numerical.train <- CaseStudy2.dtrain %>% dplyr::select(dplyr::all_of(numeric_cols))
reduced.numerical.test <- CaseStudy2.dtest %>% dplyr::select(dplyr::all_of(numeric_cols))
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
knnFit <- train(MonthlyIncome ~ ., data = reduced.numerical.train, method = "knn",
                trControl = ctrl, preProcess = c("center", "scale"), tuneLength = 20)
# Training-set RMSE for the tuned KNN model.
train_prediction <- predict(knnFit, reduced.numerical.train)
train_ASE <- mean((reduced.numerical.train$MonthlyIncome-train_prediction)^2)
print("Train RMSE")  # FIX: label typo was "RSME"
## [1] "Train RMSE"
(train_ASE)^0.5
## [1] 2944.824
# Test-set RMSE for the tuned KNN model.
test_prediction <- predict(knnFit, reduced.numerical.test)
test_ASE <- mean((reduced.numerical.test$MonthlyIncome-test_prediction)^2)
print("Test RMSE")  # FIX: label typo was "RSME"
## [1] "Test RMSE"
(test_ASE)^0.5
## [1] 3401.035
print("RMSE is BAD for KNN!!! DON'T USE Regression")
## [1] "RMSE is BAD for KNN!!! DON'T USE Regression"